# set working directory ---------------------------------------------------
setwd()

# load packages -----------------------------------------------------------
library(bibliometrix)
library(stringr)
library(stringi)
library(dplyr)
library(igraph)
library(SuperExactTest)
library(car)
library(plotly)
library(broom)
library(multcompView)
library(CINNA)
library(scales)
library(qgraph)
library(XGR)

# import map file from VOSviewer ------------------------------------------
network <- read.delim2(file = "map.txt")

# import bibliographic data from Web of Science ---------------------------
refs_import <- readLines("savedrecs.bib")

# convert data to data frame
refs <- isibib2df(refs_import)

# generate descriptive statistics for entire dataset ----------------------
overall_results <- biblioAnalysis(refs, sep = ";")

# productivity and collaboration index over time --------------------------
year <- c()
ci <- c()
num <- c()
co <- c()

# run through dataset and extract CI and productivity for each year
years_tally <- 1990
cat("Year", "\t", "CI", "\t", "Art.", "\t", "Auth.", "\n")
while(years_tally <= 2017){
  results <- biblioAnalysis(subset(refs, PY == years_tally), sep = ";")
  output <- capture.output(try(summary(results), silent = TRUE))
  year <- c(year, str_extract(output[9], "\\-*\\d+\\.*\\d*"))
  ci <- c(ci, str_extract(output[21], "\\-*\\d+\\.*\\d*"))
  num <- c(num, str_extract(output[5], "\\-*\\d+\\.*\\d*"))
  co <- c(co, length(unique(names(results$Countries))))
  years_tally <- years_tally + 1
}

# convert all values to numeric
year <- as.numeric(year)
ci <- as.numeric(ci)
num <- as.numeric(num)
co <- as.numeric(co)

# run regressions on data
prod_ci <- data.frame(year, ci, num, co)
ci_regression <- lm(formula = ci ~ year, data = prod_ci)
num_regression <- lm(formula = log(num) ~ year, data = prod_ci)
co_regression <- lm(formula = co ~ year, data = prod_ci)

# check for outliers in regression
ci_outliers <- outlierTest(ci_regression)
num_outliers <- outlierTest(num_regression)
co_outliers <- outlierTest(co_regression)

# CI #7 (1996) is an outlier, so remove that point and rerun regression without it
ci_mod <- ci[-7]
year_mod <- year[-7]
prod_ci_mod <- data.frame(year_mod, ci_mod)
ci_regression_mod <- lm(formula = ci_mod ~ year_mod, data = prod_ci_mod)

# lotka’s law -------------------------------------------------------------
lotka <- lotka(overall_results)

# compile articles in each group (cluster) --------------------------------
num_clust <- max(network$cluster)

# get all names for authors in each group 
# go through and convert author names to those compatible with map file
authors_raw <- refs$AU
authors_raw <- tolower(authors_raw)
authors_raw <- strsplit(authors_raw, ";")
i <- 1
authors <- list()
while(i <= length(refs$AU)){
  temp <- trimws(unlist(authors_raw[i]), which = "both")
  temp <- gsub(" ", ", ", temp)
  m <- regexpr(".*(, .{1})", temp)
  temp <- regmatches(temp, m)
  temp <- gsub("'", "", temp)
  
  authors[i] <- list(temp)
  
  i <- i + 1
}

# get cluster identity for each article with authors in only one cluster
i <- 1
group_index <- c()
while(i <= length(refs$AU)){
  # get unique clusters associated with each article
  temp <- unique(network$cluster[which(as.character(network$label) %in% unlist(authors[i]))])
  # if there is only one cluster associated with the article, put the cluster number into the appropriate index
  if(length(temp) == 1){
    group_index[i] <- temp
  }
  else{
    group_index[i] <- NA
  }
  i <- i + 1
}

# generate descriptive statistics for each group
i <- 1
group_results <- list()
group_ci <- list()
while(i <= num_clust){
  temp <- biblioAnalysis(refs[which(group_index == i), ], sep = ";")
  group_results[[i]] <- temp
  
  num_auth <- group_results[[i]]$AuMultiAuthoredArt
  num_pap <- group_results[[i]]$nAUperPaper[group_results[[i]]$nAUperPaper >= 2]
  group_ci[[i]] <- num_auth/length(num_pap)
  
  i <- i + 1
}

# calculate h-indices, and run anova and pairwise t-tests
i <- 1
group_h_indices <- data.frame(h = c(), group = c())
while(i <= num_clust){
  temp <- gsub(",", " ", unique(str_extract(unlist(names(group_results[[i]]$Authors)), ".*,[A-Z]{1}")))
  temp <- temp[!is.na(temp)]
  temp <- Hindex(refs, temp, sep = ";", years = max(overall_results$Years)-min(overall_results$Years))
  temp <- temp$H$h_index
  
  temp <- data.frame(h = temp, group = rep(i, length(temp)))
  group_h_indices <- rbind(group_h_indices, temp)
  
  i <- i + 1
}

h_anova <- aov(group_h_indices$h ~ factor(group_h_indices$group))
h_pairwise_t <- pairwise.t.test(group_h_indices$h, group_h_indices$group, p.adjust.method = "none")

# centrality measures -----------------------------------------------------
# this entire analysis requires that you export a Pajek partition file and a Pajek network file from VOSviewer
# read in partition file
partition <- read.delim2("partition.clu")
clusters <- partition[,1]
clusters <- as.character(clusters)

# read in network file
net <- read.graph("network.net", format = c("pajek"))

# calculate network density (proportion of actual edges to possible edges)
d <- edge_density(net)

# calculate smallworldness
sw <- smallworldness(net, B = 1000)

# determine best centrality measures of the classic four
pr_cent <- proper_centralities(net)
cal_cent <- calculate_centralities(net, include = pr_cent[c(7, 10, 15, 26)])
pca_cent <- pca_centralities(cal_cent)
pca_cent

eigen <- as.numeric(cal_cent$`eigenvector centralities`)
closeness <- as.numeric(cal_cent$`Closeness Centrality (Freeman)`)

# rescale variables
eigen <- rescale(eigen, to = c(0, 1))
eigen <- data.frame(clusters, eigen)

closeness <- rescale(closeness, to = c(0, 1))
closeness <- data.frame(clusters, closeness)

# run anovas and pairwise t tests
eigen_anova <- aov(eigen$eigen ~ factor(eigen$clusters))
eigen_pairwise_t <- pairwise.t.test(eigen$eigen, eigen$clusters, p.adjust.method = "holm")

closeness_anova <- aov(closeness$closeness ~ factor(closeness$clusters))
closeness_pairwise_t <- pairwise.t.test(closeness$closeness, closeness$clusters, p.adjust.method = "none")

# super exact test --------------------------------------------------------
# split DOI's by group
i <- 1
group_doi <- list()
while(i <= num_clust){
  temp <- refs[which(group_index == i), ]
  temp <- unique(unlist(strsplit(temp$CR, "   ")))
  temp <- temp[grepl("DOI", temp)]
  temp <- unique(gsub(".*DOI ", "", temp))
  
  group_doi[[i]] <- temp
  
  i <- i + 1
}

# calculate background population size based on total number of references with DOI's in dataset
background_pop_size <- unlist(strsplit(refs$CR, "   "))
background_pop_size <- background_pop_size[grepl("DOI", background_pop_size)]
background_pop_size <- length(background_pop_size)

# run super exact test
set_input <- list("1" = group_doi[[1]], "2" = group_doi[[2]], "3" = group_doi[[3]], "4" = group_doi[[4]], "5" = group_doi[[5]], "6" = group_doi[[6]], "7" = group_doi[[7]])
super_exact_results <- supertest(set_input, n = background_pop_size, degree = 2)